######## GAM model with manual hurdle component ##########
library(tidyverse)
## ── Attaching packages ──────────────────────────────────────── tidyverse 1.3.0 ──
## ✔ ggplot2 3.2.1 ✔ purrr 0.3.3
## ✔ tibble 2.1.3 ✔ dplyr 0.8.3
## ✔ tidyr 1.0.0.9000 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ─────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(mgcv) # for GAMs
## Loading required package: nlme
##
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
##
## collapse
## This is mgcv 1.8-31. For overview type 'help("mgcv-package")'.
library(randomForest) # for random forest
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library("lubridate") # for dates
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
library(boot) # for invlogit
tck <- read.csv("data_derived/MASTER_all_tck_data_merged.csv")
#### Filter tick dataset and adjust ####
## How many plots were tested, and how many times?
tck %>%
mutate(tested=ifelse(is.na(testingID),"no","yes")) %>%
select(plotID, tested) %>% table(useNA="ifany")
## tested
## plotID no yes
## ABBY_001 70 0
## ABBY_002 63 0
## ABBY_003 63 0
## ABBY_005 67 0
## ABBY_006 63 0
## ABBY_023 68 0
## BARR_021 3 0
## BARR_030 9 0
## BARR_031 9 0
## BARR_034 9 0
## BARR_037 9 0
## BARR_084 9 0
## BART_002 93 0
## BART_010 87 0
## BART_011 87 0
## BART_015 90 0
## BART_019 87 0
## BART_029 66 0
## BLAN_002 180 0
## BLAN_004 123 0
## BLAN_005 2193 2807
## BLAN_008 126 0
## BLAN_012 221 6
## BLAN_015 798 2
## BLAN_020 3 0
## BONA_002 15 0
## BONA_004 12 0
## BONA_012 15 0
## BONA_013 12 0
## BONA_020 12 0
## BONA_044 15 0
## CLBJ_032 87 0
## CLBJ_033 91 0
## CLBJ_034 87 0
## CLBJ_036 81 0
## CLBJ_040 87 0
## CLBJ_043 87 0
## CPER_001 66 0
## CPER_002 66 0
## CPER_003 66 0
## CPER_004 66 0
## CPER_005 66 0
## CPER_007 66 0
## DCFS_001 62 0
## DCFS_003 53 0
## DCFS_004 39 0
## DCFS_005 43 0
## DCFS_007 42 0
## DCFS_010 59 0
## DEJU_001 24 0
## DEJU_003 24 0
## DEJU_009 24 0
## DEJU_014 24 0
## DEJU_015 24 0
## DEJU_044 21 0
## DELA_001 96 0
## DELA_004 21 0
## DELA_005 113 0
## DELA_007 100 0
## DELA_008 117 6
## DELA_011 21 0
## DELA_014 115 0
## DELA_016 94 0
## DSNY_001 90 0
## DSNY_002 9 0
## DSNY_004 87 0
## DSNY_005 84 0
## DSNY_006 90 0
## DSNY_008 89 0
## DSNY_014 6 0
## DSNY_021 69 0
## GRSM_003 84 0
## GRSM_004 84 0
## GRSM_005 42 0
## GRSM_006 84 0
## GRSM_009 84 0
## GRSM_010 90 0
## GRSM_015 42 0
## GUAN_002 45 0
## GUAN_004 12 0
## GUAN_006 48 0
## GUAN_031 45 0
## GUAN_064 33 0
## GUAN_065 48 0
## GUAN_066 48 0
## HARV_001 273 471
## HARV_002 184 0
## HARV_004 3185 352
## HARV_006 771 8
## HARV_020 152 46
## HARV_022 1382 88
## HARV_026 675 12
## HEAL_001 24 0
## HEAL_004 24 0
## HEAL_010 27 0
## HEAL_011 30 0
## HEAL_014 27 0
## HEAL_026 24 0
## JERC_002 96 0
## JERC_004 96 0
## JERC_005 21 0
## JERC_010 93 0
## JERC_011 54 0
## JERC_034 90 0
## JERC_044 87 0
## JORN_001 51 0
## JORN_002 56 0
## JORN_003 54 0
## JORN_004 54 0
## JORN_005 54 0
## JORN_006 51 0
## KONA_002 47 56
## KONA_003 41 48
## KONA_004 42 0
## KONA_019 41 2
## KONA_021 42 0
## KONA_024 39 0
## KONZ_001 116 14
## KONZ_002 178 29
## KONZ_004 154 32
## KONZ_007 115 2
## KONZ_009 116 35
## KONZ_025 7938 3001
## LAJA_001 45 0
## LAJA_002 45 0
## LAJA_003 33 0
## LAJA_004 42 0
## LAJA_005 30 0
## LAJA_030 30 0
## LENO_002 832 6
## LENO_003 744 245
## LENO_004 1206 0
## LENO_015 124 0
## LENO_023 159 24
## LENO_042 102 8
## MLBS_002 24 0
## MLBS_003 24 0
## MLBS_004 19 0
## MLBS_005 24 0
## MLBS_006 18 0
## MLBS_009 24 0
## MOAB_001 60 0
## MOAB_002 60 0
## MOAB_003 57 0
## MOAB_004 60 0
## MOAB_005 60 0
## MOAB_006 60 0
## NIWO_003 27 0
## NIWO_005 27 0
## NIWO_006 27 0
## NIWO_011 27 0
## NIWO_017 27 0
## NIWO_021 27 0
## NOGP_001 75 0
## NOGP_002 76 0
## NOGP_005 72 0
## NOGP_006 72 0
## NOGP_007 72 0
## NOGP_008 72 0
## OAES_003 110 0
## OAES_004 122 0
## OAES_005 110 0
## OAES_007 109 0
## OAES_008 124 0
## OAES_021 133 0
## ONAQ_003 6 0
## ONAQ_004 63 0
## ONAQ_005 63 0
## ONAQ_008 6 0
## ONAQ_009 63 0
## ONAQ_010 6 0
## ONAQ_011 57 0
## ONAQ_016 57 0
## ONAQ_023 58 0
## ORNL_002 5278 8471
## ORNL_003 3429 561
## ORNL_007 10677 3160
## ORNL_008 4779 6088
## ORNL_009 7493 638
## ORNL_040 5148 4196
## OSBS_001 3521 3609
## OSBS_002 231 0
## OSBS_003 2012 7181
## OSBS_004 144 26
## OSBS_005 9193 28824
## OSBS_022 72 1132
## OSBS_048 41 0
## RMNP_002 18 0
## RMNP_003 18 0
## RMNP_004 18 0
## RMNP_007 18 0
## RMNP_012 18 0
## RMNP_013 18 0
## SCBI_002 2696 1036
## SCBI_005 608 60
## SCBI_006 609 16
## SCBI_007 4085 1093
## SCBI_013 1215 4714
## SCBI_039 1951 147
## SERC_001 3300 567
## SERC_002 6861 3164
## SERC_005 1537 1105
## SERC_006 634 874
## SERC_012 1806 565
## SERC_023 125 8
## SJER_001 39 0
## SJER_002 39 0
## SJER_007 39 0
## SJER_008 39 0
## SJER_009 39 0
## SJER_025 39 0
## SOAP_003 12 0
## SOAP_004 12 0
## SOAP_005 12 0
## SOAP_006 12 0
## SOAP_009 12 0
## SOAP_026 12 0
## SRER_002 42 0
## SRER_003 45 0
## SRER_004 39 0
## SRER_006 45 0
## SRER_007 39 0
## SRER_008 42 0
## STEI_001 66 0
## STEI_002 70 0
## STEI_003 69 0
## STEI_005 70 0
## STEI_023 65 0
## STEI_029 81 0
## STER_006 42 0
## STER_026 51 0
## STER_027 54 0
## STER_028 51 0
## STER_029 51 0
## STER_033 54 0
## TALL_001 2703 923
## TALL_002 5996 18767
## TALL_003 951 738
## TALL_006 675 260
## TALL_008 1819 8798
## TALL_016 208 2
## TOOL_009 9 0
## TOOL_028 9 0
## TOOL_031 9 0
## TOOL_032 9 0
## TOOL_035 9 0
## TOOL_071 9 0
## TREE_005 95 0
## TREE_007 376 373
## TREE_015 271 23
## TREE_017 741 712
## TREE_019 1599 1607
## TREE_022 152 188
## UKFS_001 48606 22766
## UKFS_002 8765 315
## UKFS_003 63104 8448
## UKFS_004 36999 6667
## UKFS_018 128 0
## UKFS_030 157 32
## UNDE_002 69 0
## UNDE_003 9 0
## UNDE_011 66 0
## UNDE_013 75 0
## UNDE_017 94 0
## UNDE_019 76 0
## UNDE_999 69 0
## WOOD_001 248 0
## WOOD_002 163 0
## WOOD_006 225 0
## WOOD_007 290 0
## WOOD_009 187 0
## WOOD_024 111 0
## WREF_001 12 0
## WREF_004 12 0
## WREF_007 12 0
## WREF_008 12 0
## WREF_009 12 0
## WREF_013 12 0
## YELL_001 9 0
## YELL_002 9 0
## YELL_003 12 0
## YELL_004 13 0
## YELL_009 12 0
## YELL_012 9 0
## What type of ticks did they test?
tck %>%
mutate(tested=ifelse(is.na(testingID),"no","yes")) %>%
select(lifeStage, tested) %>% table(useNA="ifany")
## tested
## lifeStage no yes
## Adult 9815 0
## Larva 260512 0
## Nymph 13821 155154
## Aggregate by plotID:dayOfYear:year. Count number of different life stages; create proportiontested/positive etc
tck_allsamples_borr <- tck %>%
mutate(tested=ifelse(is.na(Borrelia_sp.),0,1) # Was each tick ever tested for borrellia?
,isPositive=ifelse(Borrelia_sp.=="Positive", 1,0) # And if it was tested, was it positive or negative?
) %>%
group_by(domainID, siteID, nlcdClass, plotID, elevation, collectDate, dayOfYear, year, month, totalSampledArea) %>% # collapse by sample-- summing all tck counts together
summarize(numberTested=sum(tested, na.rm=TRUE) # number of tested ticks in that sample
,n=n() # total ticks in that sample
, numberPositive=sum(isPositive,na.rm = TRUE) # number of positive ticks in that sample
, nAdult=sum(lifeStage=="Adult")
, nNymph=sum(lifeStage=="Nymph")
, nLarva=sum(lifeStage=="Larva")) %>%
ungroup() %>%
mutate(proportionTested=numberTested/nNymph # proportion of all nymph ticks tested-- only nymphs were ever tested.
, proportionPositive=numberPositive/numberTested
, tested = ifelse(numberTested > 0 , TRUE, FALSE) # new true/false tested, which is summed across ticks
, testingStatus = ifelse(numberTested > 0, "Tested", ifelse(nNymph>0, "Nymphs present, not tested", "No nymphs"))
) %>%
mutate(nlcdClass=factor(nlcdClass, levels=c("emergentHerbaceousWetlands","cultivatedCrops","pastureHay","grasslandHerbaceous"
,"dwarfScrub","shrubScrub","sedgeHerbaceous"
,"woodyWetlands","deciduousForest","evergreenForest","mixedForest"))) %>%
mutate(borrPresent = ifelse(numberPositive>0,1,0)
, domainID = factor(as.character(domainID))
, siteID = factor(as.character(siteID))
, plotID = factor(as.character(plotID))
, year = factor(year)
, tckDensity = sum(c(nLarva, nNymph, nAdult))/totalSampledArea
, ntckDensity = nNymph/totalSampledArea
, NLtckDensity = sum(c(nNymph, nAdult), na.rm = TRUE)/totalSampledArea
, atckDensity = nAdult/totalSampledArea) %>%
mutate(lognymphDensity=log(ntckDensity)
, logNLtckDensity=log(NLtckDensity)
, logtckDensity = log(tckDensity)
, logadultDensity=log(atckDensity))
## Plot over time, by year, by plot, to see how zero-inflated it is.
tck_allsamples_borr %>%
ggplot() + geom_point(aes(x=dayOfYear, y=plotID, fill=proportionPositive, col=testingStatus), pch=21) +
scale_fill_gradient(low="white", high="darkred") +
scale_color_manual(values=c(Tested="blue", `Nymphs present, not tested`="grey", `No nymphs`="white")) +
facet_grid(.~year)

## Let's filter out plots that NEVER had any positive borrelia
infectedPlots <- tck %>%
filter(Borrelia_sp.=="Positive") %>%
select(plotID) %>%pull() %>% unique()
## Filtering same data as above; only now removing non-infected plots.
tck_borrelia_positivePlots <- tck_allsamples_borr %>%
filter(plotID %in% as.character(infectedPlots))
## Re-assess the apperance of plots
tck_borrelia_positivePlots %>%
ggplot() + geom_point(aes(x=dayOfYear, y=plotID, fill=proportionPositive, col=testingStatus), pch=21) +
scale_fill_gradient(low="white", high="darkred") +
scale_color_manual(values=c(Tested="blue", `Nymphs present, not tested`="grey", `No nymphs`="white")) +
facet_grid(.~year)

## There are STILL a lot of zeros-- zeros where there are nymphs but they were never tested.
# Let's remove those.
tck_borrelia <- tck_borrelia_positivePlots %>%
mutate(numberPositive=ifelse(numberTested==0,NA,numberPositive)) %>% # Make sure that numberPositive is not artificially zero-- if there were no tests, it should be NA
filter(numberTested!=0) # get rid of all samples where the didn't actually test.
tck_borrelia %>%
ggplot() + geom_point(aes(x=dayOfYear, y=plotID, fill=proportionPositive, col=testingStatus), pch=21) +
scale_fill_gradient(low="white", high="darkred") +
scale_color_manual(values=c(Tested="blue", `Nymphs present, not tested`="grey", `No nymphs`="white")) +
facet_grid(.~year)

nrow(tck_borrelia)
## [1] 311
# There are only 311 samples left-- still zero inflated, but slightly better.
#### Preliminary Plotting ####
# First, what is the distribution of Borrelia prevalence?
tck_borrelia %>%
ggplot() +geom_histogram(aes(x=log(numberPositive+1)), bins=100)

# histogram without any zeros
tck_borrelia %>%
filter(numberPositive>0) %>%
ggplot() +geom_histogram(aes(x=(numberPositive)), bins=100)

# histogram log abundance, no zeros
tck_borrelia %>%
filter(numberPositive>0) %>%
ggplot() +geom_histogram(aes(x=log(numberPositive)), bins=50)

# What is relationship between proportion positive and number positive
tck_borrelia %>%
ggplot() +geom_point(aes(x=log(numberPositive), y=log(proportionPositive)))

# Now, what is the relationship between number tested and nNymph? Might be a binning artifact?
tck_borrelia %>%
ggplot() +geom_point(aes(x=log(nNymph), y=log(numberTested)))

# Just get rid of all data where they didn't test most nymphs-- see how many that is
tck_borrelia %>%
filter(abs(nNymph-numberTested)>0) %>% select(plotID, collectDate, nNymph, numberTested)
## # A tibble: 36 x 4
## plotID collectDate nNymph numberTested
## <fct> <fct> <int> <dbl>
## 1 HARV_004 2015-07-13T18:22Z 1451 2
## 2 BLAN_005 2016-05-16T15:37Z 577 575
## 3 BLAN_005 2017-06-01T17:59Z 1222 1221
## 4 BLAN_005 2017-06-19T14:51Z 158 156
## 5 SCBI_002 2016-05-31T14:40Z 241 240
## 6 SCBI_002 2017-07-03T16:34Z 7 6
## 7 SCBI_007 2017-05-31T17:32Z 13 12
## 8 SCBI_013 2017-07-26T17:29Z 483 482
## 9 SERC_001 2017-06-28T15:00Z 31 30
## 10 SERC_001 2017-07-17T20:38Z 7 6
## # … with 26 more rows
# Most of these samples are not a problem, except HARV_004 (2015) and maybe SCI_002 (2017) and SERC_001 (2017-07).
# Let's get rid of only HARV_004; I think all the rest are fine.
tck_borrelia_adj <- tck_borrelia %>%
filter(abs(nNymph-numberTested)<3) # HARV_004 is the only site that differs nNymph and numberTested by more than 3
## Inspecting possible predictors of borrelia
# dayOfYear vs borrelia
tck_borrelia_adj %>%
ggplot() +geom_point(aes(x=dayOfYear, y=numberPositive))+
geom_point(aes(x=dayOfYear, y=proportionPositive*max(tck_borrelia_adj$numberPositive)), col="red", alpha=0.75) +
scale_y_continuous(sec.axis=sec_axis(trans=~./max(tck_borrelia_adj$numberPositive), name="Proportion positive"))+
theme(axis.title.y.right = element_text(color="red")) + ylab("Number positive")

# elevation vs borrelia
tck_borrelia_adj %>%
ggplot() +geom_point(aes(x=elevation, y=proportionPositive, col=domainID))

# nymph density vs borrelia
tck_borrelia_adj %>%
ggplot() +geom_point(aes(x=lognymphDensity, y=proportionPositive)) +
geom_point(aes(x=lognymphDensity, y=borrPresent), col="red")

# adult density vs borrelia
tck_borrelia_adj %>%
ggplot() +geom_point(aes(x=logadultDensity, y=proportionPositive)) +
geom_point(aes(x=logadultDensity, y=borrPresent), col="red")

# nymph and adult density vs borrelia-- first, are they correlated?
tck_borrelia_adj %>%
ggplot() + geom_density2d(aes(x=logadultDensity, y=lognymphDensity))

tck_borrelia_adj %>%
ggplot() +geom_point(aes(x=logNLtckDensity, y=proportionPositive)) +
geom_point(aes(x=logNLtckDensity, y=borrPresent), col="red")

#### Fitting a two-step GAM ####
stepGAM <- function(dat, predictors, response, constant_pred=NULL, family, ignore.combos) {
## For troubleshooting
# dat <- tck_borrelia_adj
# predictors = allPred
# response = "borrPresent"
# constant_pred <- "offset(log(numberTested))"
# family=binomial
# ignore.combos=list(c("domainID","s(domainID, bs='re')"), c("s(logNLtckDensity)","s(logadultDensity)","s(lognymphDensity)")
# , c("s(logNLtckDensity)","s(logadultDensity)"),c("s(logNLtckDensity)","s(lognymphDensity)"))
n_rows <- sum(sapply(1:length(predictors),FUN=function(x){ncol(combn(predictors,m=x))} ))
allAIC <- setNames(data.frame(matrix(nrow=n_rows, ncol=(4+length(predictors)))),nm = c("AIC","REML","Dev.expl","formula",predictors))
i <- 1
pb <- txtProgressBar(title = "progress bar", min = 0,
max = n_rows, style=3)
for ( p in 1:length(predictors) ) {
combos <- combn(predictors, m=p)
for ( c in 1:ncol(combos) ) {
# skip any combinations that are meant to be ignored
if (any(sapply(ignore.combos, FUN=function(ic){sum(ic %in% combos[,c]) == length(ic)}))) {
i = i + 1
next
}
if ( length(constant_pred)>0 ) {
frml.temp <- paste(c(paste0(response," ~ ",constant_pred),combos[,c]), collapse="+")
} else {
frml.temp <- paste0(paste0(response," ~ "),paste(combos[,c], collapse=" + "), sep="")
}
# If more coefficients than data
res <- try(gam(as.formula(frml.temp)
, data=dat
, method="REML"
, family=family))
if ( inherits(res, "try-error") ) {
next
} else {
gam.temp <- gam(as.formula(frml.temp)
, data=dat
, method="REML"
, family=family)
pred.temp <- (colnames(allAIC) %in% combos[,c])[-c(1,2,3,4)]
allAIC[i,c("AIC","REML","Dev.expl","formula")] <- as.vector(c(gam.temp$aic,gam.temp$gcv.ubre[1],summary(gam.temp)$dev.expl,frml.temp))
allAIC[i, predictors] <- pred.temp
}
i <- i+1
Sys.sleep(0.1)
setTxtProgressBar(pb, i, label=paste( round(i/total*100, 0),
"% done"))
}
}
close(pb)
return(allAIC)
}
#### Subsetting data into training and testing sets ####
# We want to subset into multiple subsets over time. How far apart are sampling days?
tck_borrelia_adj %>%
filter(year==2016, plotID=="OSBS_005") %>%
arrange(dayOfYear) %>% select(dayOfYear) %>%pull() %>%diff()
## [1] 20 21 22 42 19 43 21
# Around every 20-25 days, on average
## Get only 2014-2016 data for training
tck_borrelia_train <- tck_borrelia_adj %>% filter(year!=2017)
# Filter so test dataset so we only have those in training
inclPlots <- as.character(unique(tck_borrelia_train$plotID))
# make testing set with same plots
tck_borrelia_test <- tck_borrelia_adj %>% filter(year==2017, plotID %in% inclPlots)
## Subsetting multiple time subsets
subsetData <- list()
daysIn2017 <- tck_borrelia_adj %>% filter(year==2017, plotID %in% c(inclPlots)) %>% select(dayOfYear) %>% pull() %>% unique() %>% sort()
for ( doy in daysIn2017 ) {
subsetData[[paste(doy)]] <- list()
subsetData[[paste(doy)]][['train']] <- tck_borrelia_adj %>% filter(!(year==2017 & dayOfYear>=doy), plotID %in% inclPlots) %>% mutate(year=factor(year))
subsetData[[paste(doy)]][['test']] <- tck_borrelia_adj %>% filter((year==2017 & dayOfYear>=doy), plotID %in% inclPlots)%>% mutate(year=factor(year))
}
#### Part I: binomial hurdle component ####
# First, model a binomial component using only 2017 data
# Plot is a random effect, which I include as a varying-intercept, varying-slope model.
allPred <- c("s(dayOfYear)","nlcdClass","s(elevation)"
,"s(logNLtckDensity)","s(logadultDensity)","s(lognymphDensity)"
,"s(plotID, bs='re')", "s(plotID, dayOfYear, bs='re')"
,"s(year, bs='re')", "s(year, dayOfYear, bs='re')"
, "domainID", "s(domainID, bs='re')"
)
if (FALSE) {
allAIC <- stepGAM(dat = tck_borrelia_train, predictors = allPred, response = "borrPresent", constant_pred = "offset(log(numberTested))", family = binomial, ignore.combos=list(c("domainID","s(domainID, bs='re')"), c("s(logNLtckDensity)","s(logadultDensity)","s(lognymphDensity)") , c("s(logNLtckDensity)","s(logadultDensity)"),c("s(logNLtckDensity)","s(lognymphDensity)")))
allAIC_filt <- allAIC %>% filter(!is.na(AIC)) %>% arrange(AIC) %>%mutate(rank=seq(1:length(AIC))) %>%
mutate(AIC=as.numeric(AIC)
, Dev.expl = as.numeric(Dev.expl)
, REML = as.numeric(REML)) %>%
filter(AIC<500)
save(allAIC, file="allAIC.RData")
save(allAIC_filt, file="allAIC_filt.RData")
} else {
load("allAIC.RData")
load("allAIC_filt.RData")
}
# allAIC %>%
# as_tibble() %>%
# arrange(AIC) %>% View()
nrow(allAIC)
## [1] 4095
# Let's see distribution of AIC values
allAIC_filt %>%
ggplot() +geom_line(aes(x=rank, y=AIC))+
geom_line(aes(x=rank, y=Dev.expl*(max(allAIC_filt$AIC))), col="red", alpha=0.2) + scale_y_continuous(sec.axis=sec_axis(~./(max(allAIC_filt$AIC)), name="Deviance Explained")) +
theme(axis.title.y.right = element_text(colour = "red"))

# Compare AIC for models with and without each predictor
allAIC_filt %>% gather(-c(AIC, REML, Dev.expl, formula,rank), key=Predictor, value=WithPredictor) %>%
select(AIC, REML, Dev.expl, Predictor, WithPredictor) %>%
mutate(WithPredictor = ifelse(WithPredictor==0, FALSE, TRUE)) %>%
ggplot() + geom_violin(aes(x=WithPredictor, y=AIC)) + facet_wrap(.~Predictor)

# Compare deviance explained for models with and without each predictor
allAIC_filt %>% gather(-c(AIC, REML, Dev.expl, formula,rank), key=Predictor, value=WithPredictor) %>%
select(AIC, REML, Dev.expl, Predictor, WithPredictor) %>%
mutate(WithPredictor = ifelse(WithPredictor==0, FALSE, TRUE)) %>%
ggplot() + geom_violin(aes(x=WithPredictor, y=Dev.expl)) + facet_wrap(.~Predictor)

frml_bin1_bestAIC <- allAIC_filt[allAIC_filt$AIC==min(allAIC_filt$AIC),"formula"]
frml_bin1_bestDevexpl <- allAIC_filt[allAIC_filt$Dev.expl==max(allAIC_filt$Dev.expl),"formula"]
# Look at the model with the best AIC value
mod.gambin_bestAIC <- gam(as.formula(frml_bin1_bestAIC)
# , offset=log(numberTested) # I include the offset in the formula instead, so that it is included in predictions. Including it here does NOT incorporate numberTested in predictiosn.
, data=tck_borrelia_train
, method="REML"
, family=binomial)
summary(mod.gambin_bestAIC)
##
## Family: binomial
## Link function: logit
##
## Formula:
## borrPresent ~ offset(log(numberTested)) + nlcdClass + s(logadultDensity) +
## s(year, bs = "re") + s(domainID, bs = "re")
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.768e+00 2.083e+00 -2.769 0.00562 **
## nlcdClassdeciduousForest -7.696e-01 1.943e+00 -0.396 0.69203
## nlcdClassevergreenForest 2.806e+00 1.638e+00 1.713 0.08669 .
## nlcdClassmixedForest 5.261e+01 2.373e+07 0.000 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(logadultDensity) 4.149 5.049 10.58 0.0621 .
## s(year) 1.462 2.000 19.95 0.0816 .
## s(domainID) 5.372 6.000 102.12 8.22e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.54 Deviance explained = 68.6%
## -REML = 51.939 Scale est. = 1 n = 176
gam.check(mod.gambin_bestAIC)

##
## Method: REML Optimizer: outer newton
## full convergence after 9 iterations.
## Gradient range [-2.658227e-08,2.705552e-09]
## (score 51.939 & scale 1).
## Hessian positive definite, eigenvalue range [0.2856262,2.501879].
## Model rank = 23 / 23
##
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
##
## k' edf k-index p-value
## s(logadultDensity) 9.00 4.15 0.89 0.095 .
## s(year) 3.00 1.46 NA NA
## s(domainID) 7.00 5.37 NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(mod.gambin_bestAIC, pages=1)

# Look at the model with the best deviance explained
mod.gambin_bestDevExpl <- gam(as.formula(frml_bin1_bestDevexpl)
# , offset=log(numberTested) # I include the offset in the formula instead, so that it is included in predictions. Including it here does NOT incorporate numberTested in predictiosn.
, data=tck_borrelia_train
, method="REML"
, family=binomial)
summary(mod.gambin_bestDevExpl)
##
## Family: binomial
## Link function: logit
##
## Formula:
## borrPresent ~ offset(log(numberTested)) + s(logadultDensity) +
## s(plotID, bs = "re") + s(plotID, dayOfYear, bs = "re") +
## s(year, bs = "re") + s(year, dayOfYear, bs = "re")
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.9028 0.9139 -5.365 8.1e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(logadultDensity) 3.206e+00 3.881 4.797 0.254
## s(plotID) 6.472e+00 27.000 81.313 0.622
## s(plotID,dayOfYear) 1.752e+01 28.000 172.997 0.219
## s(year) 8.619e-05 2.000 0.000 0.763
## s(year,dayOfYear) 1.551e+00 3.000 27.936 0.117
##
## R-sq.(adj) = 0.584 Deviance explained = 73.1%
## -REML = 88.371 Scale est. = 1 n = 176
gam.check(mod.gambin_bestDevExpl)

##
## Method: REML Optimizer: outer newton
## full convergence after 8 iterations.
## Gradient range [-3.141881e-05,-1.537113e-06]
## (score 88.37141 & scale 1).
## Hessian positive definite, eigenvalue range [3.14177e-05,3.677544].
## Model rank = 72 / 72
##
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
##
## k' edf k-index p-value
## s(logadultDensity) 9.00e+00 3.21e+00 0.93 0.2
## s(plotID) 2.80e+01 6.47e+00 NA NA
## s(plotID,dayOfYear) 2.80e+01 1.75e+01 NA NA
## s(year) 3.00e+00 8.62e-05 NA NA
## s(year,dayOfYear) 3.00e+00 1.55e+00 NA NA
plot(mod.gambin_bestDevExpl, pages=1)

# Compare accuracy rate
mean((predict(mod.gambin_bestAIC, type = "response")>0.5) == tck_borrelia_train$borrPresent)
## [1] 0.875
mean((predict(mod.gambin_bestDevExpl, type = "response")>0.5) == tck_borrelia_train$borrPresent)
## [1] 0.9034091
# Honestly, the "deviance explained" looks better and makes more sense, in my opinion.
## FINAL MODEL:
frml.bin1 <- frml_bin1_bestDevexpl
mod.gambin<- gam(as.formula(frml.bin1)
# , offset=log(numberTested) # I include the offset in the formula instead, so that it is included in predictions. Including it here does NOT incorporate numberTested in predictiosn.
, data=tck_borrelia_train
, method="REML"
, family=binomial)
gam.check(mod.gambin)

##
## Method: REML Optimizer: outer newton
## full convergence after 8 iterations.
## Gradient range [-3.141881e-05,-1.537113e-06]
## (score 88.37141 & scale 1).
## Hessian positive definite, eigenvalue range [3.14177e-05,3.677544].
## Model rank = 72 / 72
##
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
##
## k' edf k-index p-value
## s(logadultDensity) 9.00e+00 3.21e+00 0.93 0.16
## s(plotID) 2.80e+01 6.47e+00 NA NA
## s(plotID,dayOfYear) 2.80e+01 1.75e+01 NA NA
## s(year) 3.00e+00 8.62e-05 NA NA
## s(year,dayOfYear) 3.00e+00 1.55e+00 NA NA
summary(mod.gambin)
##
## Family: binomial
## Link function: logit
##
## Formula:
## borrPresent ~ offset(log(numberTested)) + s(logadultDensity) +
## s(plotID, bs = "re") + s(plotID, dayOfYear, bs = "re") +
## s(year, bs = "re") + s(year, dayOfYear, bs = "re")
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.9028 0.9139 -5.365 8.1e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(logadultDensity) 3.206e+00 3.881 4.797 0.254
## s(plotID) 6.472e+00 27.000 81.313 0.622
## s(plotID,dayOfYear) 1.752e+01 28.000 172.997 0.219
## s(year) 8.619e-05 2.000 0.000 0.763
## s(year,dayOfYear) 1.551e+00 3.000 27.936 0.117
##
## R-sq.(adj) = 0.584 Deviance explained = 73.1%
## -REML = 88.371 Scale est. = 1 n = 176
plot(mod.gambin, scale=0, pages=1)

vis.gam(mod.gambin, view = c("logadultDensity","dayOfYear"), theta=45)

vis.gam(mod.gambin, view = c("year","dayOfYear"), theta=45)
## Warning in predict.gam(x, newdata = newd, se.fit = TRUE, type = type): factor
## levels 2017, 2018, 2019 not in original fit

vis.gam(mod.gambin, view = c("plotID","dayOfYear"), theta=45)
## Warning in predict.gam(x, newdata = newd, se.fit = TRUE, type = type): factor
## levels ABBY_001, ABBY_002, ABBY_003, ABBY_005, ABBY_006, ABBY_023, BARR_021,
## BARR_030, BARR_031, BARR_034, BARR_037, BARR_084, BART_002, BART_010, BART_011,
## BART_015, BART_019, BART_029, BLAN_002, BLAN_004, BLAN_008, BLAN_012, BLAN_015,
## BLAN_020, BONA_002, BONA_004, BONA_012, BONA_013, BONA_020 not in original fit

# Proportion of correct predictions
mean((predict(mod.gambin, type = "response")>0.5) == tck_borrelia_train$borrPresent)
## [1] 0.9034091
# Of those where they predicted presence/absence incorrectly, how "far" were they off?
# (i.e. were there are "negative" predictions that actually had a LOT of borrellia?)
tck_borrelia_train %>%
mutate(pred =as.numeric(predict(mod.gambin, type="response"))
,correctPredBin = ((pred>0.5) == tck_borrelia_train$borrPresent) ) %>%
ggplot() +geom_jitter(aes(x=log(numberPositive+1), y=correctPredBin), height=0.25, width=0) + ylab("Correctly predicted")

tck_borrelia_train %>%
mutate(pred =as.numeric(plogis(predict(mod.gambin)))
,correctPredBin =((pred>0.5) == tck_borrelia_train$borrPresent) ) %>%
ggplot() +geom_point(aes(x=log(numberPositive+1), y=pred, col=correctPredBin))

# Ideally, you have a positive, linear correlation between proportion positive and predicted probability. "Errors" should be correlated with small sample size.
tck_borrelia_train %>%
mutate(pred =as.numeric(plogis(predict(mod.gambin)))
,correctPredBin =((pred>0.5) == tck_borrelia_train$borrPresent) ) %>%
ggplot() +geom_point(aes(x=proportionPositive, y=pred, col=correctPredBin, cex=log(numberTested)))

# What are the groups that they least accurately predicted?
tck_borrelia_train %>%
mutate(predictedProbability =as.numeric(plogis(predict(mod.gambin)))
,correctPredBin =((predictedProbability>0.5) == tck_borrelia_train$borrPresent)
# , proportionPositive = ifelse(proportionPositive==0, NA, proportionPositive)
) %>%
ggplot() + geom_point(aes(x=dayOfYear, y=plotID, fill=proportionPositive, col=predictedProbability), pch=21,cex=3) +
scale_fill_gradient(low="white", high="darkred") +
scale_color_gradient(low="white", high="darkred") +
facet_grid(.~year)

# Is there a bias for predicting negative or positive results?
tck_borrelia_train %>%
mutate(predictedProbability =as.numeric(plogis(predict(mod.gambin)))
, predictedPA = (predictedProbability>0.5)
,correctPredBin =((predictedProbability>0.5) == tck_borrelia_train$borrPresent) ) %>%
select(borrPresent,predictedPA) %>% table()
## predictedPA
## borrPresent FALSE TRUE
## 0 115 4
## 1 13 44
4/(115+4) # False positive rate
## [1] 0.03361345
13/(13+44) # False negative rate
## [1] 0.2280702
# The false negative rate is actually a lot higher than the false positive rate.
# This means, on average, more samples are positive then you'd expect, given the model.
## So finally, we need to filter the dataset by the predicted "absent" and fit a poisson or negative binomial model
# To include the maximum amount of samples as possible, I'm going to filter the dataset sort of un-usually--
# First, I am going to keep all positive borrelia results. Then, for each "negative" borrelia result,
# I will look at the predicted probability in our model and determine whether it is a "binomial/hurdle" negative, or a "poisson/NB" negative.
tck_borrelia_filtBin <- tck_borrelia_train %>%
mutate(pred=predict(mod.gambin, type="response")) %>%
filter((borrPresent>0 | pred > 0.5))
# Double check I did the filtering correctly
tck_borrelia_filtBin %>%
mutate(predictedPA = (pred > 0.5)) %>%
select(borrPresent,predictedPA) %>% table()
## predictedPA
## borrPresent FALSE TRUE
## 0 0 4
## 1 13 44
# make year a factor
tck_borrelia_filtBin <- tck_borrelia_filtBin %>%
mutate(year=factor(year))
#### Part II: the second GAM model for abundance ####
### Try the whole model section process again with a binomial
# I'm increasing spline smoothness here, because the AIC values change drastically when I don't include them
allPred2 <- c("s(dayOfYear, sp=1)","nlcdClass","s(elevation)"
,"s(logNLtckDensity, sp=1)","s(logadultDensity, sp=1)","s(lognymphDensity, sp=1)"
,"s(plotID, bs='re')", "s(plotID, dayOfYear, bs='re')"
,"s(year, bs='re')", "s(year, dayOfYear, bs='re')"
, "domainID", "s(domainID, bs='re')"
)
ignoreCombos <- list(c("domainID","s(domainID, bs='re')")
, c("s(logNLtckDensity, sp=1)","s(logadultDensity, sp=1)","s(lognymphDensity, sp=1)")
, c("s(logNLtckDensity, sp=1)","s(lognymphDensity, sp=1)")
, c("s(logNLtckDensity, sp=1)","s(logadultDensity, sp=1)")
)
nrow(tck_borrelia_filtBin)
## [1] 61
if ( FALSE ) {
allAIC_2 <- stepGAM(dat=tck_borrelia_filtBin, predictors = allPred2, response = "cbind(numberPositive,numberTested)", family = binomial, ignore.combos=ignoreCombos)
allAIC_2_filt <- allAIC_2 %>% filter(!is.na(AIC)) %>%
mutate(AIC=as.numeric(AIC)
, Dev.expl = as.numeric(Dev.expl)
, REML = as.numeric(REML)) %>%
arrange(AIC) %>%mutate(rank=seq(1:n())) %>%
filter(AIC<2000)
save(allAIC_2, file="allAIC_2.RData")
save(allAIC_2_filt, file="allAIC_2_filt.RData")
} else {
load("allAIC_2.RData")
load("allAIC_2_filt.RData")
}
summary(allAIC_2)
## AIC REML Dev.expl formula
## Length:4095 Length:4095 Length:4095 Length:4095
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
## s(dayOfYear, sp=1) nlcdClass s(elevation) s(logNLtckDensity, sp=1)
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:698 FALSE:641 FALSE:698 FALSE:992
## TRUE :552 TRUE :609 TRUE :552 TRUE :258
## NA's :2845 NA's :2845 NA's :2845 NA's :2845
## s(logadultDensity, sp=1) s(lognymphDensity, sp=1) s(plotID, bs='re')
## Mode :logical Mode :logical Mode :logical
## FALSE:812 FALSE:812 FALSE:860
## TRUE :438 TRUE :438 TRUE :390
## NA's :2845 NA's :2845 NA's :2845
## s(plotID, dayOfYear, bs='re') s(year, bs='re') s(year, dayOfYear, bs='re')
## Mode :logical Mode :logical Mode :logical
## FALSE:860 FALSE:641 FALSE:641
## TRUE :390 TRUE :609 TRUE :609
## NA's :2845 NA's :2845 NA's :2845
## domainID s(domainID, bs='re')
## Mode :logical Mode :logical
## FALSE:855 FALSE:855
## TRUE :395 TRUE :395
## NA's :2845 NA's :2845
# allAIC_2%>%
# arrange(AIC) %>%View()
#
# # Let's see distribution of AIC values
# allAIC_2 %>%
# filter(!is.na(AIC), !is.na(Dev.expl)) %>%
# arrange(AIC) %>% mutate(rank=seq(1,n())) %>%
# ggplot() +geom_line(aes(x=rank, y=AIC))+
# geom_line(aes(x=rank, y=Dev.expl*(max(allAIC_2$AIC))), col="red", alpha=0.2) + scale_y_continuous(sec.axis=sec_axis(~./(max(allAIC_2$AIC)), name="Deviance Explained")) +
# theme(axis.title.y.right = element_text(colour = "red"))
#
allAIC_2_filt %>%
ggplot() +geom_line(aes(x=rank, y=AIC))+
geom_line(aes(x=rank, y=Dev.expl*(max(allAIC_2_filt$AIC))), col="red", alpha=0.2) + scale_y_continuous(sec.axis=sec_axis(~./(max(allAIC_2_filt$AIC)), name="Deviance Explained")) +
theme(axis.title.y.right = element_text(colour = "red"))

# Compare AIC for models with and without each predictor
allAIC_2_filt %>% gather(-c(AIC, REML, Dev.expl, formula, rank), key=Predictor, value=WithPredictor) %>%
select(AIC, REML, Dev.expl, Predictor, WithPredictor) %>%
mutate(WithPredictor = ifelse(WithPredictor==0, FALSE, TRUE)) %>%
ggplot() + geom_violin(aes(x=WithPredictor, y=AIC)) + facet_wrap(.~Predictor)

# Compare deviance explained for models with and without each predictor
allAIC_2_filt %>% gather(-c(AIC, REML, Dev.expl, formula,rank), key=Predictor, value=WithPredictor) %>%
select(AIC, REML, Dev.expl, Predictor, WithPredictor) %>%
mutate(WithPredictor = ifelse(WithPredictor==0, FALSE, TRUE)) %>%
ggplot() + geom_violin(aes(x=WithPredictor, y=Dev.expl)) + facet_wrap(.~Predictor)

frml_bin2_bestAIC <- allAIC_2_filt[allAIC_2_filt$AIC==min(allAIC_2_filt$AIC),"formula"]
frml_bin2_bestDevexpl <- allAIC_2_filt[allAIC_2_filt$Dev.expl==max(allAIC_2_filt$Dev.expl),"formula"]
# frml_bin2_bestAIC_adjdayOfYear <- "cbind(numberPositive,numberTested) ~ s(dayOfYear, sp=1) + nlcdClass + s(logNLtckDensity, sp=1) + s(plotID, bs='re') + s(plotID, dayOfYear, bs='re') + s(year, bs='re') + s(year, dayOfYear, bs='re') + domainID"
# they're the same
## Best AIC model
mod.gambin2_bestAIC <- gam(as.formula(frml_bin2_bestAIC)
, data=tck_borrelia_filtBin
, method="REML"
, family=binomial)
summary(mod.gambin2_bestAIC)
##
## Family: binomial
## Link function: logit
##
## Formula:
## cbind(numberPositive, numberTested) ~ s(dayOfYear, sp = 1) +
## s(logadultDensity, sp = 1) + s(lognymphDensity, sp = 1) +
## s(plotID, bs = "re") + s(year, dayOfYear, bs = "re")
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.9956 0.2245 -8.888 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(dayOfYear) 3.413 4.119 25.47 0.000184 ***
## s(logadultDensity) 3.372 3.758 44.58 1.11e-07 ***
## s(lognymphDensity) 3.020 3.672 80.15 3.80e-16 ***
## s(plotID) 20.484 25.000 293.13 2.59e-15 ***
## s(year,dayOfYear) 1.330 2.000 216.82 0.002659 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.823 Deviance explained = 90.5%
## -REML = 269.44 Scale est. = 1 n = 61
plot(mod.gambin2_bestAIC, pages=1)

gam.check(mod.gambin2_bestAIC)

##
## Method: REML Optimizer: outer newton
## full convergence after 5 iterations.
## Gradient range [-1.004244e-09,3.619618e-09]
## (score 269.4377 & scale 1).
## Hessian positive definite, eigenvalue range [0.542399,8.018056].
## Model rank = 57 / 57
##
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
##
## k' edf k-index p-value
## s(dayOfYear) 9.00 3.41 1.20 0.95
## s(logadultDensity) 9.00 3.37 1.12 0.88
## s(lognymphDensity) 9.00 3.02 1.27 0.98
## s(plotID) 26.00 20.48 NA NA
## s(year,dayOfYear) 3.00 1.33 NA NA
## Best Dev expl model
mod.gambin2_bestdevexpl <- gam(as.formula(frml_bin2_bestDevexpl)
, data=tck_borrelia_filtBin
, method="REML"
, family=binomial)
summary(mod.gambin2_bestdevexpl)
##
## Family: binomial
## Link function: logit
##
## Formula:
## cbind(numberPositive, numberTested) ~ nlcdClass + s(elevation) +
## s(logNLtckDensity, sp = 1) + s(plotID, dayOfYear, bs = "re") +
## domainID
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.04672 9.09647 -0.005 0.996
## nlcdClassdeciduousForest -0.05289 7.63411 -0.007 0.994
## nlcdClassevergreenForest 3.11336 7.53570 0.413 0.679
## nlcdClassmixedForest -3.32665 7.46055 -0.446 0.656
## domainIDD02 1.63473 3.60013 0.454 0.650
## domainIDD03 -0.73083 10.88858 -0.067 0.946
## domainIDD05 -11.68637 8.51298 -1.373 0.170
## domainIDD06 -0.69852 3.89949 -0.179 0.858
## domainIDD07 -5.45374 5.01527 -1.087 0.277
## domainIDD08 -10.95766 10.90795 -1.005 0.315
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(elevation) 5.923 6.211 53.36 9.81e-10 ***
## s(logNLtckDensity) 3.255 3.881 76.73 2.19e-15 ***
## s(plotID,dayOfYear) 19.882 24.000 207.78 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.746 Deviance explained = 90.8%
## -REML = 292.66 Scale est. = 1 n = 61
plot(mod.gambin2_bestdevexpl, pages=1)

gam.check(mod.gambin2_bestdevexpl)

##
## Method: REML Optimizer: outer newton
## full convergence after 6 iterations.
## Gradient range [-9.354166e-08,4.959567e-07]
## (score 292.655 & scale 1).
## Hessian positive definite, eigenvalue range [1.90978,6.107993].
## Model rank = 54 / 54
##
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
##
## k' edf k-index p-value
## s(elevation) 9.00 5.92 1.40 1.00
## s(logNLtckDensity) 9.00 3.26 1.31 0.99
## s(plotID,dayOfYear) 26.00 19.88 NA NA
### BEST MODEL:
frml.bin2 <- frml_bin2_bestAIC
mod.gambin_2 <- gam(as.formula(frml.bin2)
, data=tck_borrelia_filtBin
, method="REML"
, family=binomial)
summary(mod.gambin_2)
##
## Family: binomial
## Link function: logit
##
## Formula:
## cbind(numberPositive, numberTested) ~ s(dayOfYear, sp = 1) +
## s(logadultDensity, sp = 1) + s(lognymphDensity, sp = 1) +
## s(plotID, bs = "re") + s(year, dayOfYear, bs = "re")
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.9956 0.2245 -8.888 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(dayOfYear) 3.413 4.119 25.47 0.000184 ***
## s(logadultDensity) 3.372 3.758 44.58 1.11e-07 ***
## s(lognymphDensity) 3.020 3.672 80.15 3.80e-16 ***
## s(plotID) 20.484 25.000 293.13 2.59e-15 ***
## s(year,dayOfYear) 1.330 2.000 216.82 0.002659 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.823 Deviance explained = 90.5%
## -REML = 269.44 Scale est. = 1 n = 61
plot(mod.gambin_2, pages=1)

gam.check(mod.gambin_2)

##
## Method: REML Optimizer: outer newton
## full convergence after 5 iterations.
## Gradient range [-1.004244e-09,3.619618e-09]
## (score 269.4377 & scale 1).
## Hessian positive definite, eigenvalue range [0.542399,8.018056].
## Model rank = 57 / 57
##
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
##
## k' edf k-index p-value
## s(dayOfYear) 9.00 3.41 1.20 0.95
## s(logadultDensity) 9.00 3.37 1.12 0.82
## s(lognymphDensity) 9.00 3.02 1.27 0.99
## s(plotID) 26.00 20.48 NA NA
## s(year,dayOfYear) 3.00 1.33 NA NA
vis.gam(mod.gambin_2, view = c("dayOfYear","logadultDensity"), theta=45)

vis.gam(mod.gambin_2, view = c("dayOfYear","lognymphDensity"), theta=45)

vis.gam(mod.gambin_2, view = c("year","dayOfYear"), theta=-45)

# Residuals
plot(mod.gambin_2$residuals ~ mod.gambin_2$fitted.values, ylab="Residuals",xlab="Fitted values")

#### Part III: Trying to predict 2017 #####
predictions_over_doy_unprocessed <- data.frame(matrix(ncol=35, dimnames = list(NULL, c("pred_type","pred_bin","pred_bin2","se","se2","doy_cutoff",colnames(tck_borrelia_test)))))
pb <- txtProgressBar(title = "progress bar", min = 0,
max = length(daysIn2017), style=3)
##
|
| | 0%
## Should adjust model to NOT include an interac
# frml.bin1 <- "borrPresent ~ offset(log(numberTested))+s(logadultDensity)+s(plotID, bs='re')+s(plotID, dayOfYear, bs='re')+s(year, bs='re')"
# frml.bin2 <- "cbind(numberPositive,numberTested) ~ s(dayOfYear, sp=1) + s(logadultDensity, sp=1) + s(lognymphDensity, sp=1) + s(plotID, bs='re') + s(year, bs='re')"
for ( i in 1:length(daysIn2017) ) {
doy <- daysIn2017[i]
tempTrain <- subsetData[[paste0(doy)]][['train']]
tempTrain2017 <- tempTrain %>% filter(year==2017) %>% arrange(dayOfYear)
tempTest <- subsetData[[paste0(doy)]][['test']]
### Part I: Presence/absence
tempFit <- gam(as.formula(frml.bin1), data=tempTrain, family = binomial(), method = "REML")
tempNewPredict <- predict(tempFit, newdata = tempTest, type = "link", se.fit = TRUE)
tempOldPredict <- predict(tempFit, newdata = tempTrain2017, type="link", se.fit = TRUE)
tempTrainPredict <- predict(tempFit, type='link', se.fit=TRUE)
# #store predictions
# predictions_over_doy_unprocessed <-rbind(predictions_over_doy_unprocessed, cbind(data.frame(pred_type=c(rep("train_pred",length(tempOldPredict$fit)),rep("test_pred",length(tempNewPredict$fit)))
# , pred=c(tempOldPredict$fit, tempNewPredict$fit)
# , se = c(tempOldPredict$se.fit, tempNewPredict$se.fit)
# , doy_cutoff = rep(doy, nrow(tck_borrelia_test))
# ),rbind(tempTrain2017,tempTest)))
### Part II: prevalence
# Filter trainig dataset to exclude zeros
tempTrain2 <- tempTrain %>%
cbind(data.frame(pred_bin1=tempTrainPredict$fit)) %>%
filter(pred_bin1>0.5 | borrPresent>0)
tempFit2 <- gam(as.formula(frml.bin2), data=tempTrain2, family = binomial(), method = "REML")
tempNewPredict2 <- predict(tempFit2, newdata = tempTest, type = "link", se.fit = TRUE)
tempOldPredict2 <- predict(tempFit2, newdata = tempTrain2017, type="link", se.fit = TRUE)
Sys.sleep(0.1)
setTxtProgressBar(pb, i, label=paste( round(i/length(daysIn2017)*100, 0),
"% done"))
#store predictions
predictions_over_doy_unprocessed <-rbind(predictions_over_doy_unprocessed, cbind(data.frame(pred_type=c(rep("train_pred",length(tempOldPredict$fit)),rep("test_pred",length(tempNewPredict$fit)))
, pred_bin=c(tempOldPredict$fit, tempNewPredict$fit)
, pred_bin2=c(tempOldPredict2$fit, tempNewPredict2$fit)
, se = c(tempOldPredict$se.fit, tempNewPredict$se.fit)
, se2 = c(tempOldPredict2$se.fit, tempNewPredict2$se.fit)
, doy_cutoff = rep(doy, nrow(tck_borrelia_test))
),rbind(tempTrain2017,tempTest)))
}
## Warning in predict.gam(tempFit, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels 2017 not in original fit
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels ORNL_002, TALL_001 not in original fit
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels 2017 not in original fit
##
|
|== | 2%
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels ORNL_002, TALL_001 not in original fit
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels 2017 not in original fit
## Warning in predict.gam(tempFit2, newdata = tempTrain2017, type = "link", :
## factor levels 2017 not in original fit
##
|
|=== | 5%
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels ORNL_002, TALL_001 not in original fit
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels 2017 not in original fit
## Warning in predict.gam(tempFit2, newdata = tempTrain2017, type = "link", :
## factor levels 2017 not in original fit
##
|
|===== | 7%
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels ORNL_002, TALL_001 not in original fit
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels 2017 not in original fit
## Warning in predict.gam(tempFit2, newdata = tempTrain2017, type = "link", :
## factor levels TALL_001 not in original fit
## Warning in predict.gam(tempFit2, newdata = tempTrain2017, type = "link", :
## factor levels 2017 not in original fit
##
|
|====== | 9%
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels ORNL_002, TALL_001 not in original fit
## Warning in predict.gam(tempFit2, newdata = tempTrain2017, type = "link", :
## factor levels TALL_001 not in original fit
##
|
|======== | 11%
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels ORNL_002, TALL_001 not in original fit
## Warning in predict.gam(tempFit2, newdata = tempTrain2017, type = "link", :
## factor levels ORNL_002, TALL_001 not in original fit
##
|
|========== | 14%
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels ORNL_002, TALL_001 not in original fit
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels ORNL_002, TALL_001 not in original fit
##
|
|=========== | 16%
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels ORNL_002, TALL_001 not in original fit
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels ORNL_002, TALL_001 not in original fit
##
|
|============= | 18%
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels ORNL_002, TALL_001 not in original fit
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels ORNL_002, TALL_001 not in original fit
##
|
|============== | 20%
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels ORNL_002, TALL_001 not in original fit
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels ORNL_002, TALL_001 not in original fit
##
|
|================ | 23%
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels ORNL_002, TALL_001 not in original fit
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels ORNL_002, TALL_001 not in original fit
##
|
|================== | 25%
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels ORNL_002, TALL_001 not in original fit
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels ORNL_002, TALL_001 not in original fit
##
|
|=================== | 27%
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels ORNL_002, TALL_001 not in original fit
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels ORNL_002, TALL_001 not in original fit
##
|
|===================== | 30%
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels ORNL_002, TALL_001 not in original fit
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels ORNL_002, TALL_001 not in original fit
##
|
|====================== | 32%
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels ORNL_002, TALL_001 not in original fit
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels ORNL_002, TALL_001 not in original fit
##
|
|======================== | 34%
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels ORNL_002, TALL_001 not in original fit
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels ORNL_002, TALL_001 not in original fit
##
|
|========================= | 36%
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels ORNL_002, TALL_001 not in original fit
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels ORNL_002, TALL_001 not in original fit
##
|
|=========================== | 39%
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels ORNL_002, TALL_001 not in original fit
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels ORNL_002, TALL_001 not in original fit
##
|
|============================= | 41%
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels ORNL_002, TALL_001 not in original fit
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels ORNL_002, TALL_001 not in original fit
##
|
|============================== | 43%
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels ORNL_002, TALL_001 not in original fit
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels ORNL_002, TALL_001 not in original fit
##
|
|================================ | 45%
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels TALL_001 not in original fit
## Warning in predict.gam(tempFit2, newdata = tempTrain2017, type = "link", :
## factor levels TALL_001 not in original fit
##
|
|================================= | 48%
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels TALL_001 not in original fit
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels TALL_001 not in original fit
##
|
|=================================== | 50%
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels TALL_001 not in original fit
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels TALL_001 not in original fit
##
|
|===================================== | 52%
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels TALL_001 not in original fit
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels TALL_001 not in original fit
##
|
|====================================== | 55%
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels TALL_001 not in original fit
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels TALL_001 not in original fit
##
|
|======================================== | 57%
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels TALL_001 not in original fit
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels TALL_001 not in original fit
##
|
|========================================= | 59%
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels TALL_001 not in original fit
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels TALL_001 not in original fit
##
|
|=========================================== | 61%
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels TALL_001 not in original fit
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels TALL_001 not in original fit
##
|
|============================================= | 64%
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels TALL_001 not in original fit
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels TALL_001 not in original fit
##
|
|============================================== | 66%
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels TALL_001 not in original fit
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels TALL_001 not in original fit
##
|
|================================================ | 68%
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels TALL_001 not in original fit
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels TALL_001 not in original fit
##
|
|================================================= | 70%
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels TALL_001 not in original fit
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels TALL_001 not in original fit
##
|
|=================================================== | 73%
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels TALL_001 not in original fit
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels TALL_001 not in original fit
##
|
|==================================================== | 75%
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels TALL_001 not in original fit
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels TALL_001 not in original fit
##
|
|====================================================== | 77%
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels TALL_001 not in original fit
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels TALL_001 not in original fit
##
|
|======================================================== | 80%
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels TALL_001 not in original fit
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels TALL_001 not in original fit
##
|
|========================================================= | 82%
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels TALL_001 not in original fit
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels TALL_001 not in original fit
##
|
|=========================================================== | 84%
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels TALL_001 not in original fit
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels TALL_001 not in original fit
##
|
|============================================================ | 86%
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels TALL_001 not in original fit
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels TALL_001 not in original fit
##
|
|============================================================== | 89%
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels TALL_001 not in original fit
## Warning in predict.gam(tempFit2, newdata = tempTest, type = "link", se.fit =
## TRUE): factor levels TALL_001 not in original fit
##
|
|================================================================ | 91%
|
|================================================================= | 93%
|
|=================================================================== | 95%
|
|==================================================================== | 98%
|
|======================================================================| 100%
predictions_over_doy <- predictions_over_doy_unprocessed %>%
filter(!is.na(pred_bin))
# Summarize some MSE stats
MSE_summary <- predictions_over_doy %>%
mutate(ressq_bin = (inv.logit(pred_bin)-borrPresent)^2
, ressq_bin2 = (inv.logit(pred_bin2)-proportionPositive)^2
, predCount_bin2 = inv.logit(pred_bin2)*numberTested
, ressq_countbin2 = (inv.logit(predCount_bin2)-numberPositive)^2) %>%
group_by(pred_type, doy_cutoff) %>%
summarize(MSE_bin=sum(ressq_bin)/n()
, MSE_bin2=sum(ressq_bin2)/n()
, MSE_countbin2= sum(ressq_countbin2)/n()
, predAve_bin = sum(borrPresent)/n()
, predAve_bin2 = mean(inv.logit(pred_bin2))
, predAveCount_bin2 = mean(inv.logit(predCount_bin2))) %>%
ungroup()
## Get number of correct in predictions
summedCorrect <- predictions_over_doy %>%
mutate(pred_borrPresent=ifelse(inv.logit(pred_bin)>0.5,1,0)
, correct = borrPresent==pred_borrPresent
, falsePositive = ifelse(borrPresent==0 & pred_borrPresent==1, 1, 0)
, falseNegative = ifelse (borrPresent==1 & pred_borrPresent==0,1,0)) %>%
group_by(doy_cutoff, pred_type) %>%
summarize(nCorrect = sum(correct, na.rm=TRUE)
, nFPositive = sum(falsePositive, na.rm=TRUE)
, nFNegative = sum(falseNegative, na.rm=TRUE)
, n = n()) %>%
mutate(propCorrect = nCorrect/n
, propFPositive = nFPositive/n
, propFNegative = nFNegative/n)
## Binomial1
MSE_summary %>%
mutate(dayOfYear_cutoff = as.Date(doy_cutoff, origin = "2017-01-01")
, month = month(dayOfYear_cutoff)) %>%
ggplot() + geom_line(aes(x=dayOfYear_cutoff, y=MSE_bin, col=pred_type)) +
scale_y_continuous(sec.axis = sec_axis(trans=~./0.5, name = "Borrelia presence/absence"), limits = c(0,0.5)) +
geom_point(data=tck_borrelia_test, aes(x=as.Date(dayOfYear, origin = "2017-01-01"), y=as.numeric(borrPresent)*0.5)) +
geom_smooth(data=tck_borrelia_test, aes(x=as.Date(dayOfYear, origin = "2017-01-01"), y=borrPresent*0.5))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 4 rows containing missing values (geom_smooth).

## Binomial 2- proportions
MSE_summary %>%
mutate(dayOfYear_cutoff = as.Date(doy_cutoff, origin = "2017-01-01")
, month = month(dayOfYear_cutoff)) %>%
ggplot() + geom_line(aes(x=dayOfYear_cutoff, y=MSE_bin2, col=pred_type)) +
geom_point(data=tck_borrelia_test, aes(x=as.Date(dayOfYear, origin = "2017-01-01"), y=proportionPositive/5)) +
geom_smooth(data=tck_borrelia_test, aes(x=as.Date(dayOfYear, origin = "2017-01-01"), y=proportionPositive/5)) +
scale_y_continuous(sec.axis = sec_axis(trans=~.*5, name = "Proportion positive for Borrelia"), limits=c(0,0.1))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing missing values (geom_point).
## Warning: Removed 5 rows containing missing values (geom_smooth).

## Binomial 2- counts
MSE_summary %>%
mutate(dayOfYear_cutoff = as.Date(doy_cutoff, origin = "2017-01-01")
, month = month(dayOfYear_cutoff)) %>%
ggplot() + geom_line(aes(x=dayOfYear_cutoff, y=MSE_countbin2, col=pred_type)) +
geom_point(data=tck_borrelia_test, aes(x=as.Date(dayOfYear, origin = "2017-01-01"), y=log(numberPositive)*500)) +
geom_smooth(data=tck_borrelia_test, aes(x=as.Date(dayOfYear, origin = "2017-01-01"), y=log(numberPositive)*500)) +
scale_y_continuous(sec.axis = sec_axis(trans=~./500, name = "log Number positive for Borrelia"))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 85 rows containing non-finite values (stat_smooth).

summedCorrect %>%
mutate(dayOfYear_cutoff = as.Date(doy_cutoff, origin = "2017-01-01")
, month = month(dayOfYear_cutoff)) %>%
ggplot() + geom_point(aes(x=dayOfYear_cutoff, y=propCorrect, col=pred_type))

summedCorrect %>%
mutate(dayOfYear_cutoff = as.Date(doy_cutoff, origin = "2017-01-01")
, month = month(dayOfYear_cutoff)) %>%
ggplot() + geom_point(aes(x=dayOfYear_cutoff, y=propFPositive, col=pred_type))

summedCorrect%>%
mutate(dayOfYear_cutoff = as.Date(doy_cutoff, origin = "2017-01-01")
, month = month(dayOfYear_cutoff)) %>%
ggplot() + geom_point(aes(x=dayOfYear_cutoff, y=propFNegative, col=pred_type))

#### Part IV: Simulate predictions and assess predictive accuracy ####
# First, make a column of sample names for samples
predictions_over_doy <- predictions_over_doy %>%
mutate(rown = 1:n()
,SimSample = paste0("V",rown))
nsim <- 1000
countPred_sim <- matrix(ncol=nrow(predictions_over_doy), nrow=nsim)
propPred_sim <- matrix(ncol=nrow(predictions_over_doy), nrow=nsim)
bin1_sim_all <- matrix(ncol=nrow(predictions_over_doy), nrow=nsim)
bin2_sim_all <- matrix(ncol=nrow(predictions_over_doy), nrow=nsim)
if ( FALSE ) {
pb <- txtProgressBar(title = "progress bar", min = 0,
max = nrow(predictions_over_doy), style=3)
for ( r in 1:nrow(predictions_over_doy)) {
expbin_sim <- rnorm(n=nsim, mean=predictions_over_doy[r,"pred_bin"], sd=predictions_over_doy[r,"se"] )
bin_sim <- rbinom(n=nsim,size=1,prob = inv.logit(expbin_sim) )
expbin2_sim <- rnorm(n=nsim, mean=predictions_over_doy[r,"pred_bin2"], sd=predictions_over_doy[r,"se2"] )
bin2_sim <- rbinom(n=nsim, size=predictions_over_doy[r,"numberTested"], prob=inv.logit(expbin2_sim))
exp_count_combined <- bin_sim*bin2_sim
exp_prop_combined <- exp_count_combined/predictions_over_doy[r,"numberTested"]
# Save
bin1_sim_all[,r] <- bin_sim
bin2_sim_all[,r] <- bin2_sim
countPred_sim[,r] <- exp_count_combined
Sys.sleep(0.1)
setTxtProgressBar(pb, r)
}
save(bin1_sim_all, file="bin1_sim_all.RData")
save(bin2_sim_all, file="bin2_sim_all.RData")
save(countPred_sim, file="countPred_sim.RData")
} else {
load("bin1_sim_all.RData")
load("bin2_sim_all.RData")
load("countPred_sim.RData")
}
countPred_sim_long <- countPred_sim %>%
as.data.frame() %>%
mutate(SimID = 1:nsim) %>%
gather(-SimID,key=SimSample, value=sim_count)
# propPred_sim_long <- propPred_sim %>%
# as.data.frame() %>%
# gather(key=SampleID, value=sim_prop)
if (FALSE) {
sim_results <- predictions_over_doy %>%
full_join(countPred_sim_long) %>%
mutate(prop_sim = sim_count/numberTested
,SampleID = paste0(plotID,year,dayOfYear, sep="_"))
save(sim_results, file="sim_results.RData")
} else {
load("sim_results.RData")
}
sim_results_summarized <- sim_results %>%
group_by(SampleID, numberPositive, numberTested, proportionPositive, pred_type, pred_bin, pred_bin2, doy_cutoff) %>%
summarise(CI97.5_count=quantile(sim_count, probs=0.975)
, CI2.5_count=quantile(sim_count, probs=0.025)
, sd_count=sd(sim_count)
, mean_count=mean(sim_count)
, CI97.5_prop=quantile(prop_sim, probs=0.975)
, CI2.5_prop=quantile(prop_sim, probs=0.025)
, sd_prop=sd(prop_sim)
, mean_prop=mean(prop_sim)) %>%
ungroup() %>%
mutate(res_count = numberPositive-mean_count)
# count_sim_results$doy_cutoff
# No CI bars; logged
sim_results_summarized %>%
ggplot() + geom_point(aes(x=log(numberPositive+1), y=log(mean_count+1), col=pred_type)) +
geom_abline(aes(intercept=0,slope=1))

# Within 95% confidence intervals?
sim_results_summarized %>%
filter(pred_type=="test_pred") %>%
mutate(inCI = (numberPositive>=CI2.5_count & numberPositive<=CI97.5_count)) %>%
select(doy_cutoff, inCI, numberPositive, CI2.5_count, CI97.5_count) %>%
group_by(doy_cutoff) %>%
summarize(propInCI95 = sum(inCI)/n()) %>%
ungroup() %>%
mutate(Cutoff_for_training = as.Date(doy_cutoff, origin="2017-01-01")) %>%
ggplot()+ geom_line(aes(x=Cutoff_for_training, y=propInCI95)) +
ylab("Proportion of observations in 95% CIs") +
xlab("Date of cutoff for training model")

#
# library("brms")
#
# #### BRMS: Part I #####
# if (FALSE ) {
# brm_bin1 <- brm(bf(frml.bin1)
# , seed=48
# , data=tck_borrelia_adj
# , family=bernoulli
# # , prior = gambin_priors
# , control=list(adapt_delta=0.99, max_treedepth=15)
# )
# save(brm_bin1, file="brm_bin1.RData")
# # save(brm_bin_noelev, file="brm_bin_noelev.RData")
#
# } else {
# load("brm_bin1.RData")
# # load("brm_bin_noelev.RData")
# }
# summary(brm_bin1)
# plot(brm_bin1)
# conditional_effects(brm_bin1, effects = "dayOfYear")
# conditional_effects(brm_bin1, effects = "lognymphDensity")
#
#
# # Look at how estimated probability maps to actual data for borrelia
# fitted(brm_bin1) %>%
# cbind(brm_bin1$data) %>%
# ggplot() +geom_point(aes(x=Estimate, y=borrPresent))
#
# # Inspect error rate (false positive and false negative)
# predict(brm_bin1) %>%
# cbind(brm_bin1$data) %>%
# mutate(Pred = ifelse(Estimate>0.5, 1, 0)) %>%
# select(borrPresent, Pred) %>% table()
# 12/(12+194) # false positive rate
# 29/(29+73) # false negative rate
#
# # See if there is correlation between probability and how many positives there actually were
# fitted(brm_bin1) %>%
# cbind(tck_borrelia_adj[,c("proportionPositive","numberPositive", "numberTested")]) %>%
# ggplot() + geom_point(aes(x=numberPositive, y=Estimate, cex=numberTested))
#
# ## Look at effect of various predictors
# # Effect of day of year
# fitted(brm_bin1) %>%
# cbind(brm_bin1$data) %>%
# left_join(tck_borrelia_adj) %>%
# ggplot() + geom_point(aes(x=dayOfYear, y=Estimate, col=nlcdClass)) + geom_point(aes(x=dayOfYear, y=borrPresent), col="red",alpha=0.2) + geom_smooth(aes(x=dayOfYear, y=borrPresent))
# conditional_smooths(brm_bin1, smooths="s(dayOfYear)")
#
# # Effect of tck density
# fitted(brm_bin1) %>%
# cbind(brm_bin1$data) %>%
# left_join(tck_borrelia_adj) %>%
# ggplot() + geom_point(aes(x=lognymphDensity, y=Estimate, col=domainID)) + geom_point(aes(x=lognymphDensity, y=borrPresent), col="red",alpha=0.2) + geom_smooth(aes(x=lognymphDensity, y=borrPresent))
# conditional_smooths(brm_bin1, smooths="s(lognymphDensity)")
#
# # Since each sample is IID, we can include all positive results in poisson component of model, and use fitted probabilities to determine
# # whether each zero is a "binomial" zero or a "poisson" zero.
# tck_borrelia_filtbin_brm <- tck_borrelia_adj %>%
# cbind(predict(brm_bin1)) %>%
# filter(borrPresent>0 | Estimate>0.5)
#
# ## Now try to fit a second distribution?
# tck_borrelia_filtbin_brm %>%
# ggplot() + geom_histogram(aes(x=proportionPositive), bins=20)
# tck_borrelia_filtbin_brm %>%
# ggplot() + geom_histogram(aes(x=numberPositive), bins=20)
#
# ##### BRMS: Part II ####
# # Re-format frml for brm
# frml_bin2_adj <- gsub(pattern="cbind(numberPositive,numberTested)",replacement="numberPositive | trials(numberTested)",frml_bin2_bestDevexpl, fixed = TRUE)
#
# if (FALSE) {
# brm_bin2 <- brm(bf(as.formula(frml_bin2_adj))
# , data=tck_borrelia_filtbin_brm
# , family=binomial()
# , seed=24
# , control = list(adapt_delta=0.95, max_treedepth=15)
# )
# save(brm_bin2, file = "brm_bin2.RData")
# } else {
# load("brm_bin2.RData")
# }
# summary(brm_bin2)
#
# plot(brm_bin2)
# conditional_smooths(brm_bin2, smooths = "s(dayOfYear)")
# conditional_smooths(brm_bin2, smooths = "s(lognymphDensity, sp = 1)")
# conditional_smooths(brm_bin2, smooths = "s(logadultDensity, sp = 1)")
#
# conditional_effects(brm_bin2, effects = "dayOfYear")
# conditional_effects(brm_bin2, effects = "logadultDensity")
# conditional_effects(brm_bin2, effects = "lognymphDensity")
#
#
# predict(brm_bin2) %>%
# cbind(tck_borrelia_filtbin_brm[,c("numberPositive","dayOfYear","numberTested","logNLtckDensity","year","plotID")]) %>%
# ggplot() + geom_point(aes(x=log(numberPositive+1), y=log(Estimate+1))) +
# geom_segment(aes(x=log(numberPositive+1), xend=log(numberPositive+1), y=log(Q2.5+1), yend=log(Q97.5+1)), col="red")
#
#
#
#
#
#